#imports
import nltk
from utils.data.readCorpus import NltkCorpusFromDir
import pandas as pd
import numpy as np
import plotly.express as px
from scipy.stats import variation
# Prepare the corpus: every *.txt file under the lemmatised LatinISE directory.
latinise = NltkCorpusFromDir(root="/home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas", fileids=r".*\.txt")

# Prepare corpus metadata.
# Fetch the file ids once; they are reused for the filename -> id table and
# for the membership test below.
corpus_files = latinise.fileids()
# The text id is the third underscore-separated field of the file name.
filenames = pd.DataFrame(
    [(fname, fname.split('_')[2]) for fname in corpus_files],
    columns=["filename", "id"],
)
metadata = pd.read_csv("/media/HOME_FOLDERS/krzys/Kod/lvlt22/BMG/latinise_metadata.csv", index_col="id")
metadata = metadata.merge(filenames, on="id")
metadata = metadata.drop_duplicates('id')
metadata = metadata.set_index('filename')

# Token count per file. Files missing from the corpus get 0 so that the
# filter below removes them. (The original fallback was `0 in filename`,
# which raises TypeError for a string filename instead of yielding 0.)
known_files = set(corpus_files)
metadata["no_tokens"] = [
    len(latinise.words(filename)) if filename in known_files else 0
    for filename in metadata.index
]

# Cut the corpus: keep only files that contain at least one token.
metadata = metadata[metadata["no_tokens"] > 0]
# Try every combination of bin width (1..499) and first bin edge
# (-450 up to, but not including, 0, advanced by the bin width). For each
# grid, record how many files and how many tokens fall into each period.
# Each entry of bins_test is:
#   [binstart, binstep, number of bin edges, files per period, tokens per period]
bins_test = []
for binstep in range(1, 500):
    for binstart in range(-450, 0, binstep):
        # Bin edges run from binstart up to (but excluding) 850.
        bins = range(binstart, 850, binstep)
        # Rows whose date falls outside the edges get a NaN period and are
        # removed by dropna().
        tmp = pd.DataFrame(data={
            'count': metadata["no_tokens"],
            'period': pd.cut(metadata["date"], bins=bins, include_lowest=True),
        }).dropna()
        # Group once and reuse it for both aggregates. observed=False keeps
        # empty periods in the result (as 0) regardless of the pandas
        # version's default for categorical groupers.
        per_period = tmp.reset_index().groupby("period", observed=False)
        bins_test.append([
            binstart,
            binstep,
            len(bins),
            per_period["filename"].count(),
            per_period["count"].sum(),
        ])
# Collect the per-grid results into one frame, one row per (start, step) pair.
# "length" is the number of bin edges; "files" and "tokens" hold the
# per-period Series produced by the search loop.
bins_df = pd.DataFrame(
    bins_test,
    columns=["start", "step", "length", "files", "tokens"],
)

# Coefficient of variation of the per-period file counts.
bins_df["var_files"] = [variation(per_period) for per_period in bins_df["files"]]
# Coefficient of variation of the per-period token counts.
bins_df["var_tokens"] = [variation(per_period) for per_period in bins_df["tokens"]]

# Grids that start at -450 and have more than three bin edges, most even
# token distribution first (displays the top five rows in a notebook).
starts_at_earliest = bins_df["start"] == -450
enough_edges = bins_df["length"] > 3
bins_df[starts_at_earliest & enough_edges].sort_values("var_tokens").head()
| | start | step | length | files | tokens | var_files | var_tokens |
|---|---|---|---|---|---|---|---|
| 2826 | -450 | 233 | 6 | period (-450.001, -217.0] 2 (-217.0, 16.0]... | period (-450.001, -217.0] 803 (-217.0, ... | 0.586747 | 0.564911 |
| 2824 | -450 | 232 | 6 | period (-450.001, -218.0] 2 (-218.0, 14.0]... | period (-450.001, -218.0] 803 (-218.0, ... | 0.586747 | 0.564911 |
| 2822 | -450 | 231 | 6 | period (-450.001, -219.0] 2 (-219.0, 12.0]... | period (-450.001, -219.0] 803 (-219.0, ... | 0.587592 | 0.565472 |
| 2812 | -450 | 226 | 6 | period (-450.001, -224.0] 2 (-224.0, 2.0] ... | period (-450.001, -224.0] 803 (-224.0, ... | 0.609080 | 0.567348 |
| 2814 | -450 | 227 | 6 | period (-450.001, -223.0] 2 (-223.0, 4.0] ... | period (-450.001, -223.0] 803 (-223.0, ... | 0.609080 | 0.567348 |
# Plot both variation measures against the bin width, restricted to the
# grids whose first edge is -450.
earliest_start = bins_df.loc[bins_df["start"] == -450]
fig = px.line(
    earliest_start,
    x="step",
    y=["var_tokens", "var_files"],
    hover_data=["tokens", "files"],
)
fig.show()